# Load all necessary packages here
library(tidyverse)
library(readxl)
library(tigris)
library(leaflet)
library(htmltools)
library(htmlwidgets)
library(scales)
library(plotly)
library(moderndive)
library(ggpubr)
library(leaflet.extras)


# create a vector of all the states in the West region of the U.S.
West = c("WA", "OR", "ID", "MT", "WY", "CO", 
         "UT", "NV", "CA", "AK", "HI")

# create a vector of all the states in the Southwest region of the U.S.
Southwest = c("AZ", "NM", "TX", "OK")

# create a vector of all the states in the Southeast region of the U.S.
Southeast = c("AR", "LA", "MS", "AL", "GA", "FL", "SC", 
              "TN", "NC", "KY", "VA", "WV", "DC", "DE", "MD")

# create a vector of all the states in the Northeast region of the U.S.
Northeast = c("NJ", "CT", "RI", "PA", "NY", "MA", "NH", "VT", "ME")

# create a vector of all the states in the Midwest region of the U.S.
Midwest = c("ND", "SD", "NE", "KS", "MO", "IA", "MN", 
            "WI", "IL", "IN", "OH", "MI")


# make a dataframe for our first dataset (poverty and household income)
# filter for year 2010
# select variables of importance
# to get ready to join: rename a column
poverty_income <- read_excel("state_series_1980-2014.xls") %>%
  filter(year == 2010) %>%
  select(fips, state_code, median_hhinc, percent_pov, total_pop) %>%
  rename("FIPST" = fips)

# make a dataframe for our second dataset (dropout rates)
# select variables of importance
dropoutRates <- read_excel("sdr091a.xls") %>%
  select(FIPST, STATENAME, DRP912,
         TOTDHI, TOTDBL, TOTDWH, TOTDAS, TOTDAM)

# join by FIPS state numeric code
# remove District of Columbia
# add a new variable called "region" to the dataframe based on what region the state is in
joinedDataFull <- inner_join(dropoutRates, poverty_income, by = "FIPST") %>%
  filter(!(STATENAME == "District of Columbia")) %>%
  mutate(region = case_when(state_code %in% West ~ "West",
                            state_code %in% Southwest ~ "Southwest",
                            state_code %in% Southeast ~ "Southeast", 
                            state_code %in% Northeast ~ "Northeast",
                            state_code %in% Midwest ~ "Midwest"))

Introduction

We used the state dropout data1 from the Common Core of Data (CCD) surveys that are submitted every year to the National Center for Education Statistics (NCES).

We also used demographic and economic data2. This data came from the U.S. Bureau of Labor Statistics Local Area Statistics Project, U.S. Census Bureau Small Area Income and Poverty Estimates, and U.S. Census Bureau Population and Housing Estimates. We chose to focus on median household income, poverty level, and total resident population (all people who are usually residents of a specific state3).

Interactive Stacked Barplot

We created an interactive stacked barplot4 and chose one common way to divide the United States into five regions5. To easily compare the heights of the different colors between the bars, you can hover your mouse over a color. The popup label will tell you the total population for a given U.S. region and the number of dropouts for a particular race and region.

# group by region
# count: number of Hispanic drop outs, number of Black drop outs,
# number of White drop outs, number of Asian/Hawaiian Native/Pacific Islander drop outs,
# number of American Indian/Alaska Native drop outs, total population
barplotData <- joinedDataFull %>%
  group_by(region) %>% 
  summarize(SUMHI = sum(TOTDHI), SUMBL = sum(TOTDBL), SUMWH = sum(TOTDWH),
            SUMAS = sum(TOTDAS), SUMAM = sum(TOTDAM), SUMPOP = sum(total_pop))


# make stacked barplot
plot_ly(data = barplotData, x = ~region, y = ~SUMHI, 
        type = 'bar', name = 'Hispanic',
        text = paste("Total Region Population:", comma(barplotData$SUMPOP)),
        marker = list(color = 'rgb(0,0,128)')) %>%
  add_trace(y = ~SUMBL, name = 'Black',
            marker = list(color = 'rgb(30,144,255)')) %>%
  add_trace(y = ~SUMWH, name = 'White',
            marker = list(color = 'rgb(135,206,250)')) %>%
  add_trace(y = ~SUMAS, name = 'Asian/Hawaiian Native/Pacific Islander',
            marker = list(color = 'rgb(0,191,255)')) %>%
  add_trace(y = ~SUMAM, name = 'American Indian/Alaska Native',
            marker = list(color = 'rgb(20, 106, 162)')) %>%
  layout(title ="Total High School Dropouts by Race in the 2009-2010 School Year",
         yaxis = list(title = 'Number of Dropouts', tickformat = ",d"), 
         xaxis = list(title = 'U.S. Region', categoryorder = "array",
                      categoryarray = c("West", "Southeast", "Midwest", 
                                        "Southwest", "Northeast")),
         barmode = 'stack',
         legend = list(x = 100, y = 0.5)) %>%
  config(displayModeBar = FALSE)

The descending order of the barplot6 allows us to easily see that the West has the largest number of dropouts and the Northeast has the fewest. However, this could be due to the fact that the West has a larger population than the Northeast. However, this reasoning doesn’t explain why the West has more dropouts than the Southeast or Midwest since both of these regions have larger populations than the West.

Interactive Choropleth Map

We created an interactive choropleth map7 which is colored by 2009-2010 dropout rates. Clicking on a state will allow you to see the state’s total population, high school dropout rate, percent of people living in poverty, and the median household income. We start our interactive map centered on the U.S. at zoom level 4. The icon with four arrows8 will bring you back to to this setting.

# load spatial data
states <- states()

# inner join spatial data and a dataframe
states_merged <- geo_join(states, joinedDataFull, "STUSPS", "state_code", how = "inner")

# make blue color palette based on the range of dropout rate numbers
pal_DRP912 <- colorNumeric("Blues", domain=states_merged$DRP912)

# make popup labels
popup_label <- paste0("<strong>", states_merged$NAME, 
                      "</strong><br />Total Population: ", 
                      comma(states_merged$total_pop),
                      "<br />Dropout Rate: ", 
                      paste(format(round(states_merged$DRP912, 2), nsmall = 2), "%", 
                            sep = ""),
                      "<br />Percent Poverty: ", 
                      paste(states_merged$percent_pov, "%", sep = ""),
                      "<br />Median Household Income: ", 
                      comma(states_merged$median_hhinc))
# make interactive map
# at start: center the map on the U.S.
# add Earth icon to set to zoom level 4
leaflet(states) %>%
  addProviderTiles("CartoDB.Positron") %>%
  setView(-98.483330, 38.712046, zoom = 4) %>% 
  addPolygons(data = states_merged, 
              fillColor = ~pal_DRP912(states_merged$DRP912), 
              fillOpacity = 0.7, 
              weight = 0.2, 
              smoothFactor = 0.2, 
              highlight = highlightOptions(weight = 5, color = "#666",
                                           fillOpacity = 0.7, bringToFront = TRUE),
              popup = ~popup_label,
              label = states_merged$NAME) %>%
  addLegend(pal = pal_DRP912, 
            values = states_merged$DRP912, 
            position = "bottomright", 
            title = "Dropout Rate",
            labFormat = labelFormat(suffix = "%")) %>%
  addResetMapButton()

From the map, we can see that Arizona is the state with the highest 2010 dropout rate (7.8%) and New Hampshire is the state with the lowest dropout rate (1.2%).

Regression

Simple Linear Regression

We created two simple linear regressions9 to see the relationships between variables and dropout rates.

# scatterplot of poverty vs. total dropout rates
# add regression lines
povertyplot <- ggplot(joinedDataFull, aes(x = DRP912, y = percent_pov, color = region)) +
  geom_point(aes(size = total_pop)) +
  scale_color_manual(values = c('#000080', '#1E90FF', '#87CEFA', '#00BFFF', '#146AA2')) +
  labs(x = "Dropout Rate", y = "Percent of People Living in Poverty", 
       color = "U.S. Region", size = "Total Population",
       title = "Relationship between
       Dropout Rates and Poverty Level") +
  geom_smooth(method = "lm", se = FALSE) +
  theme(text = element_text(size=14)) +
  scale_y_continuous(labels = function(x) paste0(x, "%"))

# slrmodel_poverty <-  lm(DRP912~percent_pov, data = joinedDataFull)
# summary(slrmodel_poverty)



# scatterplot of median income vs. total dropout rates
# add regression lines
incomeplot <- ggplot(joinedDataFull, aes(x = DRP912, y = median_hhinc, color = region)) +
  geom_point(aes(size = total_pop)) +
  scale_color_manual(values = c('#000080', '#1E90FF', '#87CEFA', '#00BFFF', '#146AA2')) +
  labs(x = "Dropout Rate", y = "Median Household Income", 
       color = "U.S. Region", size = "Total Population",
       title = "Relationship between
       Dropout Rates and Income") +
  geom_smooth(method = "lm", se = FALSE) +
  scale_y_continuous(label = comma) +
  theme(text = element_text(size = 14))

# slrmodel_income <-  lm(DRP912~median_hhinc, data = joinedDataFull)
# summary(slrmodel_income)


# combine the two simple regressions in one figure
ggarrange(povertyplot, incomeplot, ncol = 2, nrow = 1, 
          common.legend = TRUE, legend = "bottom")

Explain relations (signs and coefficients)

multipleregressionmodel <- lm(DRP912~percent_pov+median_hhinc, joinedDataFull)
summary(multipleregressionmodel)
## 
## Call:
## lm(formula = DRP912 ~ percent_pov + median_hhinc, data = joinedDataFull)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4518 -1.0091 -0.3034  0.7761  3.4140 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  -8.546e+00  4.065e+00  -2.102  0.04090 * 
## percent_pov   4.274e-01  1.257e-01   3.400  0.00138 **
## median_hhinc  1.165e-04  4.778e-05   2.438  0.01862 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.436 on 47 degrees of freedom
## Multiple R-squared:  0.2064, Adjusted R-squared:  0.1726 
## F-statistic: 6.112 on 2 and 47 DF,  p-value: 0.004371
get_regression_table(multipleregressionmodel)
term estimate std_error statistic p_value lower_ci upper_ci
intercept -8.546 4.065 -2.102 0.041 -16.724 -0.369
percent_pov 0.427 0.126 3.400 0.001 0.175 0.680
median_hhinc 0.000 0.000 2.438 0.019 0.000 0.000
#plot(multipleregressionmodel)

Interactive Multiple Regression Plot

# colors for each region in plot
colors <- c('#000080', '#1E90FF', '#87CEFA', '#00BFFF', '#146AA2')


scatterplot <- plot_ly(joinedDataFull, 
                       x = ~percent_pov, 
                       y = ~DRP912, 
                       z = ~median_hhinc, 
                       color = ~region,
                       colors = colors,
                       hoverinfo = 'text',
                       text = ~paste('Percent Poverty:', 
                                     paste(percent_pov, "%", sep = ""), 
                                     '<br>Dropout Rate:', 
                                     paste(format(round(DRP912, 2), nsmall = 2), "%", 
                                           sep = ""), 
                                     '<br>Median Household Income:', 
                                     comma(median_hhinc))) %>%
  add_markers() %>%
  layout(scene = list(xaxis = list(title = 'Dropout Rate'),
                      yaxis = list(title = 'Percent Poverty'),
                      zaxis = list(title = 'Median Household Income'))) %>%
  config(displayModeBar = FALSE)


api_create(scatterplot, filename="scatterplot1")

Conclusion

References and Citations


  1. “Common Core of Data (CCD).” National Center for Education Statistics (NCES) Home Page, a Part of the U.S. Department of Education, National Center for Education Statistics, nces.ed.gov/ccd/drpcompstatelvl.asp.

  2. “Arts & Sciences, Public Policy.” William and Mary, www.wm.edu/as/publicpolicy/schroedercenter/for-faculty/Downloadable%20Health%20Datasets/State%20Level%20Downloadable%20Health%20Datasets/index.php.

  3. Daly, Michael. Documentation: State Variable Longitudinal Dataset [1980 – 2014]. 12 Feb. 2016, www.wm.edu/as/publicpolicy/schroedercenter/for-faculty/Downloadable%20Health%20Datasets/State%20Level% 20Downloadable%20Health%20Datasets/Documentation%20State%20Variable%20Longitudinal%20Data%201980-2014.pdf.

  4. “Bar Charts.” Modern Visualization for the Data Era - Plotly, plot.ly/r/bar-charts/.

  5. National Geographic Society. “United States Regions.” National Geographic Society, 9 Nov. 2012, www.nationalgeographic.org/maps/united-states-regions/.

  6. Anglim, Jeromy. “Formatting Decimal places in R.” Stack Overflow, 27 Aug. 2012, stackoverflow.com/questions/3443687/formatting-decimal-places-in-r.

  7. Tran, Andrew Ba. “Interactive Choropleth Maps.” Interactive Choropleth Maps :: Journalism with R, learn.r-journalism.com/en/mapping/census_maps/census-maps/.

  8. Karambelkar, Bhaskar. “Leaflet.extras.” Function | R Documentation, www.rdocumentation.org/packages/leaflet.extras/versions/1.0.0/topics/addResetMapButton.

  9. Wan, Huiyan. “Add a common Legend for combined ggplots.” Stack Overflow, 27 Oct. 2017, stackoverflow.com/questions/13649473/add-a-common-legend-for-combined-ggplots.